df <- read.csv("data/GermanCredit.csv", header=TRUE, sep=";")
# Conversion des données catégorielles en factor
df$CHK_ACCT <- as.factor(df$CHK_ACCT)
df$HISTORY <- as.factor(df$HISTORY)
df$SAV_ACCT <- as.factor(df$SAV_ACCT)
df$EMPLOYMENT <- as.factor(df$EMPLOYMENT)
df$PRESENT_RESIDENT <- as.factor(df$PRESENT_RESIDENT)
df$JOB <- as.factor(df$JOB)Projet VID
Données sur les crédits Allemands
Introduction
Dans ce projet il nous est demandé de trouver les variables permettant d’obtenir le meilleur modèle de régression linéaire multiple pour déterminer si c’est une bonne idée de faire un crédit bancaire à une personne.
Analyse exploratoire des données
Vérification des données
summary(df) OBS. CHK_ACCT DURATION HISTORY NEW_CAR
Min. : 1.0 0:274 Min. :-6.00 0: 40 Min. :0.000
1st Qu.: 250.8 1:269 1st Qu.:12.00 1: 49 1st Qu.:0.000
Median : 500.5 2: 63 Median :18.00 2:530 Median :0.000
Mean : 500.5 3:394 Mean :20.89 3: 88 Mean :0.234
3rd Qu.: 750.2 3rd Qu.:24.00 4:293 3rd Qu.:0.000
Max. :1000.0 Max. :72.00 Max. :1.000
USED_CAR FURNITURE RADIO.TV EDUCATION RETRAINING
Min. :0.000 Min. :0.000 Min. :0.00 Min. :0.00 Min. :0.000
1st Qu.:0.000 1st Qu.:0.000 1st Qu.:0.00 1st Qu.:0.00 1st Qu.:0.000
Median :0.000 Median :0.000 Median :0.00 Median :0.00 Median :0.000
Mean :0.103 Mean :0.181 Mean :0.28 Mean :0.05 Mean :0.097
3rd Qu.:0.000 3rd Qu.:0.000 3rd Qu.:1.00 3rd Qu.:0.00 3rd Qu.:0.000
Max. :1.000 Max. :1.000 Max. :1.00 Max. :1.00 Max. :1.000
AMOUNT SAV_ACCT EMPLOYMENT INSTALL_RATE MALE_DIV
Min. : 250 0:603 0: 62 Min. :1.000 Min. :0.00
1st Qu.: 1366 1:103 1:172 1st Qu.:2.000 1st Qu.:0.00
Median : 2320 2: 63 2:339 Median :3.000 Median :0.00
Mean : 3271 3: 48 3:174 Mean :2.973 Mean :0.05
3rd Qu.: 3972 4:183 4:253 3rd Qu.:4.000 3rd Qu.:0.00
Max. :18424 Max. :4.000 Max. :1.00
MALE_SINGLE MALE_MAR_or_WID CO.APPLICANT GUARANTOR
Min. :0.000 Min. :0.000 Min. :0.000 Min. :-1.000
1st Qu.:0.000 1st Qu.:0.000 1st Qu.:0.000 1st Qu.: 0.000
Median :1.000 Median :0.000 Median :0.000 Median : 0.000
Mean :0.549 Mean :0.092 Mean :0.041 Mean : 0.051
3rd Qu.:1.000 3rd Qu.:0.000 3rd Qu.:0.000 3rd Qu.: 0.000
Max. :2.000 Max. :1.000 Max. :1.000 Max. : 1.000
PRESENT_RESIDENT REAL_ESTATE PROP_UNKN_NONE AGE
1:130 Min. :0.000 Min. :0.000 Min. : 19.00
2:308 1st Qu.:0.000 1st Qu.:0.000 1st Qu.: 27.00
3:149 Median :0.000 Median :0.000 Median : 33.00
4:413 Mean :0.282 Mean :0.154 Mean : 35.53
3rd Qu.:1.000 3rd Qu.:0.000 3rd Qu.: 42.00
Max. :1.000 Max. :1.000 Max. :151.00
NA's :14
OTHER_INSTALL RENT OWN_RES NUM_CREDITS JOB
Min. :0.000 Min. :0.000 Min. :0.000 Min. :1.000 0: 22
1st Qu.:0.000 1st Qu.:0.000 1st Qu.:0.000 1st Qu.:1.000 1:200
Median :0.000 Median :0.000 Median :1.000 Median :1.000 2:630
Mean :0.186 Mean :0.179 Mean :0.713 Mean :1.407 3:148
3rd Qu.:0.000 3rd Qu.:0.000 3rd Qu.:1.000 3rd Qu.:2.000
Max. :1.000 Max. :1.000 Max. :1.000 Max. :4.000
NUM_DEPENDENTS TELEPHONE FOREIGN RESPONSE
Min. :1.000 Min. :0.000 Min. :0.000 Min. :0.0
1st Qu.:1.000 1st Qu.:0.000 1st Qu.:0.000 1st Qu.:0.0
Median :1.000 Median :0.000 Median :0.000 Median :1.0
Mean :1.155 Mean :0.404 Mean :0.037 Mean :0.7
3rd Qu.:1.000 3rd Qu.:1.000 3rd Qu.:0.000 3rd Qu.:1.0
Max. :2.000 Max. :1.000 Max. :1.000 Max. :1.0
library(ggplot2)
library(gridExtra)
# Liste des colonnes sans "OBS."
columns_to_plot <- setdiff(names(df), c("OBS."))
# Liste des graphiques
plots <- list()
# Boucle pour faire un graphique par colonne
for (col in columns_to_plot) {
# Création du graphique
p <- ggplot(df, aes_string(x = col)) +
geom_bar() +
ggtitle(paste("Colonne: ", col)) +
xlab(col) +
theme_minimal()
# Ajout du graphique à la liste
plots[[col]] <- p
}
# Affichage des graphiques (2 colonnes)
do.call(grid.arrange, c(plots, ncol = 2))Voici les données observées que ne jouent pas avec la donnée: valeur min DURATION -6?, MALE_SINGLE une valeur à 2, GUARANTOR -1, PRESENT_RESIDENT valeurs de 1 à 4 alors que la données dit valeurs 0 à 3, valeur max AGE 151. Pour AGE il y a 14 valeurs manquantes.
Voici les corrections validées par le client:
Correction des erreurs dans les données
Premier modèle de régression multiple
df.lm <- lm(formula = RESPONSE ~ . - OBS., data = df)
summary(df.lm)
Call:
lm(formula = RESPONSE ~ . - OBS., data = df)
Residuals:
Min 1Q Median 3Q Max
-1.06285 -0.30776 0.08616 0.28136 0.90095
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 7.732e-01 1.771e-01 4.367 1.4e-05 ***
CHK_ACCT1 8.250e-02 3.694e-02 2.233 0.025765 *
CHK_ACCT2 1.926e-01 5.817e-02 3.311 0.000964 ***
CHK_ACCT3 2.830e-01 3.487e-02 8.117 1.5e-15 ***
DURATION -4.514e-03 1.517e-03 -2.976 0.002991 **
HISTORY1 4.700e-03 9.001e-02 0.052 0.958366
HISTORY2 1.455e-01 7.084e-02 2.054 0.040292 *
HISTORY3 1.976e-01 7.799e-02 2.534 0.011445 *
HISTORY4 2.608e-01 7.164e-02 3.641 0.000287 ***
NEW_CAR -1.179e-01 6.094e-02 -1.935 0.053339 .
USED_CAR 9.934e-02 6.995e-02 1.420 0.155902
FURNITURE 3.491e-03 6.363e-02 0.055 0.956266
RADIO.TV 1.124e-02 6.069e-02 0.185 0.853160
EDUCATION -1.433e-01 8.013e-02 -1.788 0.074069 .
RETRAINING -1.494e-02 7.012e-02 -0.213 0.831349
AMOUNT -1.556e-05 7.239e-06 -2.150 0.031807 *
SAV_ACCT1 4.518e-02 4.442e-02 1.017 0.309438
SAV_ACCT2 8.090e-02 5.471e-02 1.479 0.139578
SAV_ACCT3 1.569e-01 6.209e-02 2.528 0.011637 *
SAV_ACCT4 1.205e-01 3.576e-02 3.369 0.000785 ***
EMPLOYMENT1 1.725e-03 7.011e-02 0.025 0.980373
EMPLOYMENT2 5.776e-02 6.688e-02 0.864 0.387988
EMPLOYMENT3 1.303e-01 6.999e-02 1.861 0.062989 .
EMPLOYMENT4 6.912e-02 6.676e-02 1.035 0.300754
INSTALL_RATE -4.407e-02 1.326e-02 -3.324 0.000923 ***
MALE_DIV -6.800e-02 6.319e-02 -1.076 0.282187
MALE_SINGLE 7.602e-02 3.208e-02 2.370 0.017987 *
MALE_MAR_or_WID 2.539e-02 4.953e-02 0.513 0.608342
CO.APPLICANT -6.803e-02 6.574e-02 -1.035 0.301023
GUARANTOR 1.770e-01 5.928e-02 2.986 0.002896 **
PRESENT_RESIDENT2 -1.237e-01 4.438e-02 -2.787 0.005419 **
PRESENT_RESIDENT3 -6.385e-02 4.994e-02 -1.279 0.201341
PRESENT_RESIDENT4 -5.693e-02 4.497e-02 -1.266 0.205859
REAL_ESTATE 3.227e-02 3.189e-02 1.012 0.311889
PROP_UNKN_NONE -8.947e-02 5.976e-02 -1.497 0.134657
AGE 1.653e-03 1.298e-03 1.273 0.203350
OTHER_INSTALL -8.211e-02 3.489e-02 -2.353 0.018820 *
RENT -1.062e-01 7.303e-02 -1.454 0.146390
OWN_RES -3.417e-02 7.024e-02 -0.486 0.626785
NUM_CREDITS -3.908e-02 2.803e-02 -1.394 0.163515
JOB1 -8.530e-02 1.033e-01 -0.825 0.409399
JOB2 -9.774e-02 1.006e-01 -0.971 0.331615
JOB3 -7.416e-02 1.023e-01 -0.725 0.468507
NUM_DEPENDENTS -3.964e-02 3.820e-02 -1.038 0.299742
TELEPHONE 5.015e-02 2.968e-02 1.690 0.091422 .
FOREIGN 1.622e-01 7.035e-02 2.306 0.021323 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.3972 on 940 degrees of freedom
(14 observations deleted due to missingness)
Multiple R-squared: 0.2853, Adjusted R-squared: 0.2511
F-statistic: 8.34 on 45 and 940 DF, p-value: < 2.2e-16
library(ggResidpanel)
resid_interact(df.lm, plots = c("resid", "qq", "cookd", "boxplot"))Recherche du meilleur modèle
```{r}
library(leaps)
Best_Subset <- regsubsets(RESPONSE~., data = df, nbest = 1, nvmax = NULL, force.in = NULL, force.out = "OBS.", method = "exhaustive")
summary_best_subset <- summary(regsubsets.out)
as.data.frame(summary_best_subset$outmat)
which.max(summary_best_subset$adjr2)
summary_best_subset$which[13,]
``````{r}
library(leaps)
models <- regsubsets(RESPONSE ~ ., data=df, nvmax=NULL, force.out = "OBS.")
summary(models)
res.sum <- summary(models)
data.frame(
Adj.R2 = which.max(res.sum$adjr2),
CP = which.min(res.sum$cp),
BIC = which.min(res.sum$bic)
)
``````{r}
get_model_formula <- function(id, object, outcome){
# get models data
models <- summary(object)$which[id,-1]
# Get outcome variable
form <- as.formula(object$call[[2]])
outcome <- all.vars(form)[1]
# Get model predictors
predictors <- names(which(models == TRUE))
predictors <- paste(predictors, collapse = "+")
# Build model formula
as.formula(paste0(outcome, "~", predictors))
}
#lm10 <- lm(get_model_formula(10, models, "RESPONSE"), data=df)
#summary(lm10)
#lm18 <- lm(get_model_formula(18, models, "RESPONSE"), data=df)
#summary(lm18)
#lm23 <- lm(get_model_formula(23, models, "RESPONSE"), data=df)
#summary(lm23)
```